Title analysis: titles of male and female speakers
Title analysis
# Load the presentation records from the CSV file. `as.is = TRUE` keeps
# character columns as character instead of converting them to factors.
# TRUE is spelled out because `T` is an ordinary variable that can be reassigned.
data.tit <- read.table(
  "data/presentations_PPGE_2008-2019.csv",
  sep = ",",
  header = TRUE,
  as.is = TRUE
)
# Convert the date column with dmy() and derive the year with year()
# (both presumably from lubridate, loaded outside this excerpt).
data.tit$date <- dmy(data.tit$date)
data.tit$year <- year(data.tit$date)
#skimr::skim(data.tit)
Excluding special events, such as round tables and discussions, that are not related to a project or study presented by someone.
IDs <- c(154, 250, 211, 289)
data.tit <- data.tit %>% filter(!id %in% IDs) %>% filter(!is.na(title_english))table(data.tit$gender)##
## F M
## 144 185
table(data.tit$position_cat, data.tit$gender)##
## F M
## others 4 1
## postdoc 25 32
## professor 25 75
## student 88 75
Formatting for tidytext
tit <- data.tit %>% dplyr::select(id,gender,position_cat, audience_n,
title_english)
text_tok <- tit %>% unnest_tokens(output=word,
input=title_english)
Excluding stopwords, e.g. "and", "or", "the", "of", "in".
Standardizing plurals.
# list of English stopwords
stop_w <- tibble(word = stopwords(source = "stopwords-iso"))
# remove the stopwords from the corpus
text <- text_tok %>%
anti_join(stop_w, by="word")
# remove numbers, dashes and other unwanted words
# NOTE(review): the first entry of `remover` looks like a mis-encoded dash
# character - confirm it still matches the token produced from the data.
remover <- c("ăŒ", "1", "1st", "2", "364", "40", "70", "750", "aff", "da")
text <- text %>% filter(!word %in% remover )
# handling simple plurals - just dropping the trailing S
plural <- c("actions","advances", "adaptations", "amphibians", "animals", "ants","anurans",
"applications","approaches", "bees","builds", "birds",
"cerrados","challenges",
"continents","crops",
"decisions","declines","determines","determinants", "defenses",
"dynamics",
"economics", "ecosystems","environments", "experiences",
"forests",
"genetics","gifts","gradients","guides","impacts",
"increases","interactions","lives",
"landscapes","males","mammals", "mangroves","models","movements",
"mutualisms","networks","neotropics",
"opilions","phenotypes","plants","projects","paths", "perspectives",
"populations","promotes","relationships", "relations",
"resources","responses","roads","services","skulls","snakes","seeds",
"spaces", "spiders","stages", "trees", "variations",
"threats")
text$word[text$word %in% plural] <-
substr(text$word[text$word %in% plural],
1,nchar(text$word[text$word %in% plural])-1)- Grouping similar words:
# Lookup table used to group similar words: column 1 is the variant found in
# the titles, column 2 is the canonical form it is replaced with.
# NOTE(review): some variants are misspelled ("heterogeneties", "intregating",
# "occuring", "studing"); presumably they mirror typos in the raw titles, so
# they are kept verbatim - confirm against the data.
lemma <- rbind(
  c("adaptive", "adaptation"),
  c("advancement", "advance"),
  c("agricultural", "agriculture"),
  c("agro", "agriculture"),
  c("amazonia", "amazon"),
  c("amazonian", "amazon"),
  c("andean", "andes"),
  c("apply", "application"),
  c("applying", "application"),
  c("apidae", "apis"),
  c("arachnida", "arachnid"),
  c("argue", "argument"),
  c("basal", "basis"),
  c("behavioral", "behavior"),
  c("behavioural", "behavior"),
  c("bignonieae", "bignoniaceae"),
  c("biological", "biology"),
  c("brazilian", "brazil"),
  c("building", "build"),
  c("changing", "change"),
  c("cnidarian", "cnidaria"),
  c("coastal", "coast"),
  c("colour", "color"),
  c("colors", "color"),
  c("communities", "community"),
  c("competitive", "competition"),
  c("complexity", "complex"),
  c("convergences", "convergence"),
  c("convergent", "convergence"),
  # Was "cordata.tit": apparently a stray search-and-replace of
  # "data" -> "data.tit" that leaked into this string.
  c("cordatus", "cordata"),
  c("croplands", "crop"),
  c("cultural", "culture"),
  c("darwin's", "darwin"),
  c("darwinian", "darwin"),
  c("defensive", "defense"),
  c("dependent", "dependence"),
  c("detecting", "detection"),
  c("determine", "determinant"),
  c("developmental", "development"),
  c("dispersers", "dispersal"),
  c("disturbed", "disturbance"),
  c("diversification", "diversity"),
  c("dragonflies", "dragonfly"),
  c("drier", "drought"),
  c("ecological", "ecology"),
  c("ecologists", "ecology"),
  c("endemic", "endemism"),
  c("effectiveness", "efficiency"),
  c("environmental", "environment"),
  c("evolutionary", "evolution"),
  c("expanding", "expansion"),
  c("extinct", "extinction"),
  c("facilitate", "facilitation"),
  c("fisheries", "fishery"),
  c("floral", "flora"),
  c("floristic", "flora"),
  c("forested", "forest"),
  c("functional", "function"),
  c("functionally", "function"),
  c("functioning", "function"),
  c("geographical", "geographic"),
  c("heterogeneties", "heterogeneity"),
  c("heterogeneous", "heterogeneity"),
  c("histories", "history"),
  c("integrated", "integration"),
  c("intregating", "integration"),
  c("integrative", "integration"),
  c("invasive", "invasion"),
  c("isotopic", "isotope"),
  c("linking", "link"),
  c("living", "live"),
  c("mammalia", "mammal"),
  c("managed", "manage"),
  c("managers", "manage"),
  c("mathematical", "mathematics"),
  c("mates", "mating"),
  c("mediated", "mediate"),
  c("mechanistic", "mechanism"),
  c("matrices", "matrix"),
  c("migratory", "migration"),
  c("mimicking", "mimicry"),
  c("modeling", "model"),
  c("mutualistic", "mutualism"),
  c("natural", "nature"),
  c("neotropical", "neotropic"),
  c("northeastern", "northeast"),
  c("occuring", "occur"),
  c("onça", "onca"),
  c("opiliones", "opilion"),
  c("parasite", "parasitism"),
  c("parent", "parenting"),
  c("phylogenies", "phylogeny"),
  c("phylogenetic", "phylogeny"),
  c("phylogenomic", "phylogeny"),
  c("pollinators", "pollination"),
  c("protected", "protect"),
  c("protective", "protect"),
  c("rainfall", "rain"),
  c("reconstructing", "reconstruction"),
  c("regulatory", "regulation"),
  c("regulates", "regulation"),
  c("relation", "relationship"),
  c("reproductive", "reproduction"),
  c("restored", "restoration"),
  c("robustness", "robust"),
  c("scientific", "science"),
  c("scientist", "science"),
  c("sexy", "sexual"),
  c("simulated", "simulation"),
  c("societies", "society"),
  c("social", "society"),
  c("socio", "society"),
  c("space", "spatial"),
  c("spacio", "spatial"),
  c("stabilize", "stability"),
  c("stable", "stability"),
  c("stories", "story"),
  c("strategic", "strategy"),
  c("strategies", "strategy"),
  c("structured", "structure"),
  c("structuring", "structure"),
  c("studies", "study"),
  c("studing", "study"),
  c("sustainable", "sustainability"),
  c("theories", "theory"),
  c("theoretical", "theory"),
  c("threatened", "threat"),
  c("tropical", "tropic"),
  c("vision", "visual")
)
# Keep both columns as character: on R < 4.0 the default would create factors,
# and assigning a factor into a character vector writes its integer codes.
lemma <- as.data.frame(lemma, stringsAsFactors = FALSE)
# Replace each variant (column 1 of `lemma`) by its canonical form (column 2).
# seq_len() is used so that an empty table yields zero iterations, whereas
# 1:0 would iterate over c(1, 0).
for (i in seq_len(nrow(lemma))) {
  text$word[text$word == lemma[i, 1]] <- lemma[i, 2]
}
Counting words Frequency by gender
Removing stopwords, we keep 2340 words.
table(text$gender)##
## F M
## 1082 1258
table(text$position_cat, text$gender)##
## F M
## others 16 10
## postdoc 179 230
## professor 172 452
## student 703 554
pala <- text %>%
count(word)
20 palavras mais comuns
text %>%
count(word, sort = TRUE) %>%
top_n(20,n)%>%
kable()| word | n |
|---|---|
| ecology | 50 |
| forest | 42 |
| evolution | 32 |
| landscape | 27 |
| bird | 22 |
| model | 22 |
| diversity | 21 |
| environment | 21 |
| species | 21 |
| plant | 18 |
| structure | 17 |
| atlantic | 15 |
| brazil | 15 |
| effects | 15 |
| conservation | 14 |
| interaction | 13 |
| study | 13 |
| bee | 12 |
| community | 12 |
| network | 12 |
| patterns | 12 |
| são | 12 |
word cloud
textplot_wordcloud(x=dfm(tokens(text$word)))par(mfrow=c(1,2))
textplot_wordcloud(x=dfm(tokens(text$word[text$gender=="F"])),
col="#6D57CF")
par(new=T)
textplot_wordcloud(x=dfm(tokens(text$word[text$gender=="M"])),
col="#FCA532")Word frequencies by gender
# Relative frequency of every word within each gender, reshaped to one row per
# word with proportion_F / proportion_M and n_F / n_M columns, plus the
# absolute and relative difference between the two proportions.
props <- text %>%
  count(gender, word) %>%
  group_by(gender) %>%
  mutate(proportion = n / sum(n)) %>%
  ungroup() %>%
  pivot_wider(names_from = gender, values_from = c(proportion, n)) %>%
  mutate(
    abs.dif.p = abs(proportion_F - proportion_M),
    rel.dif.p = pmax(proportion_F, proportion_M) /
      pmin(proportion_F, proportion_M)
  ) %>%
  arrange(desc(abs.dif.p))

# Label only the (up to) 20 words with the largest absolute difference;
# rows are already sorted by abs.dif.p in decreasing order.
top_rows <- seq_len(min(20L, nrow(props)))
props$label <- NA_character_
props$label[top_rows] <- props$word[top_rows]

# Scatter plot of male vs. female word proportions on log-log axes; the dashed
# line is the identity line (equal proportion in both genders).
ggplot(props, aes(x = proportion_M, y = proportion_F, color = abs.dif.p)) +
  geom_abline(color = "gray40", lty = 2) +
  #geom_point(size=2.5, alpha=0.5)+
  geom_jitter(size = 2.5, alpha = 0.5) +
  geom_text_repel(aes(label = label), size = 3) +
  scale_x_log10(name = "Male most used words",
                labels = percent_format(), limits = c(0.0005, 0.03)) +
  scale_y_log10(name = "Female Most used words",
                labels = percent_format(), limits = c(0.0005, 0.03)) +
  scale_color_gradient(name = "Abs Diff", low = "blue", high = "red",
                       labels = percent_format()) +
  theme(legend.justification = c(1, -0.1), legend.position = c(1, 0)) # geom_smooth(method="lm")
ggsave("figures/title_wordFreq.jpg", height = 5, width = 7)
Words that are close to the dashed line in these plots have similar frequencies in both genders. Words that are far from the line are words that are found more in one set of texts than another.
Legend: absolute differences in the frequency of the word by males and females. Top 20 words by absolute differences are also indicated in text.
Correlation of word frequency use between genders:
cor.test(props$proportion_F, props$proportion_M)##
## Pearson's product-moment correlation
##
## data: props$proportion_F and props$proportion_M
## t = 15.393, df = 236, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.6380756 0.7660220
## sample estimates:
## cor
## 0.7078068
Highly correlated: both genders tend to use the main words with similar frequencies.
prop2 <- props %>% filter(!is.na(label)) %>%
arrange(desc(proportion_F), desc(proportion_M)) %>%
mutate(ntot = n_F + n_M) %>%
mutate(word = fct_reorder(word,(ntot),max),
proportion_F = proportion_F*-1) %>%
pivot_longer(2:3,names_to = "gender", values_to ="proportion")
ggplot(prop2, aes(x=proportion, y=word,fill=gender)) +
geom_col()+ ylab("") + xlab("Proportion")+
scale_fill_manual(name="gender", values=c("#6D57CF","#FCA532"),
labels=c("F", "M"))+
geom_vline(xintercept = c(-0.02,-0.01,0,0.01,0.02),
linetype="dotted",
col="darkgray") +
scale_x_continuous(breaks=c(-0.02,-0.01,0,0.01,0.02),
labels = c(0.02,0.01,0,0.01,0.02))ggsave("figures/title_wordFreq_barplot.jpeg", units="in", width=7, height=7, dpi=300)PROFESSOR Word frequencies by gender
# Same word-frequency comparison as for all speakers, restricted to titles of
# presentations given by professors.
propsP <- text %>%
  filter(position_cat == "professor") %>%
  count(gender, word) %>%
  group_by(gender) %>%
  mutate(proportion = n / sum(n)) %>%
  ungroup() %>%
  pivot_wider(names_from = gender, values_from = c(proportion, n)) %>%
  mutate(
    abs.dif.p = abs(proportion_F - proportion_M),
    rel.dif.p = pmax(proportion_F, proportion_M) /
      pmin(proportion_F, proportion_M)
  ) %>%
  arrange(desc(abs.dif.p))

# Label only the (up to) 20 words with the largest absolute difference;
# rows are already sorted by abs.dif.p in decreasing order.
top_rows <- seq_len(min(20L, nrow(propsP)))
propsP$label <- NA_character_
propsP$label[top_rows] <- propsP$word[top_rows]

# Scatter plot of male vs. female word proportions on log-log axes; the dashed
# line is the identity line (equal proportion in both genders).
ggplot(propsP, aes(x = proportion_M, y = proportion_F, color = abs.dif.p)) +
  geom_abline(color = "gray40", lty = 2) +
  # geom_point(size=2.5, alpha=0.3) +
  geom_jitter(size = 2.5, alpha = 0.3) +
  geom_text_repel(aes(label = label), size = 3) +
  scale_x_log10(name = "Male most used words",
                labels = percent_format()) +
  scale_y_log10(name = "Female Most used words",
                labels = percent_format()) +
  scale_color_gradient(name = "Abs Diff", low = "blue", high = "red",
                       labels = percent_format()) +
  theme(legend.justification = c(1, -0.1), legend.position = c(1, 0)) # geom_smooth(method="lm")
# File name changed from "abstract_wordFreq_Prof.jpg": this is the title
# analysis and every other figure it writes is named "title_*", so the old
# name looks like a copy-paste leftover from the abstract analysis.
ggsave("figures/title_wordFreq_Prof.jpg", height = 5, width = 7)
Words that are close to the dashed line in these plots have similar frequencies in both genders. Words that are far from the line are words that are found more in one set of texts than another.
Legend: absolute differences in the frequency of the word by males and females. The top 20 words by absolute difference are also indicated in text.
Correlation of word frequency use between genders:
cor.test(propsP$proportion_F, propsP$proportion_M)##
## Pearson's product-moment correlation
##
## data: propsP$proportion_F and propsP$proportion_M
## t = 1.3985, df = 47, p-value = 0.1685
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.0861594 0.4554761
## sample estimates:
## cor
## 0.1998798
No significant correlation (r = 0.20, p = 0.17).
20 words with the largest differences in frequency
# Long-format table of the labelled (top 20) professor words for a mirrored
# bar plot: female proportions are negated so they extend to the left.
propP2 <- propsP %>%
  filter(!is.na(label)) %>%
  arrange(desc(proportion_F), desc(proportion_M)) %>%
  mutate(ntot = n_F + n_M) %>%
  mutate(
    word = fct_reorder(word, (ntot), max),
    proportion_F = proportion_F * -1
  ) %>%
  # Columns named explicitly instead of by position (2:3).
  pivot_longer(c(proportion_F, proportion_M),
               names_to = "gender", values_to = "proportion")

# Axis breaks are defined once; the tick labels are their absolute values so
# both sides of the mirrored axis read as positive proportions. (The label for
# the -0.01 break was previously hard-coded as -0.01.)
axis_breaks <- c(-0.03, -0.02, -0.01, 0, 0.01, 0.02, 0.03)

ggplot(propP2, aes(x = proportion, y = word, fill = gender)) +
  geom_col() + ylab("") + xlab("Proportion") +
  scale_fill_manual(name = "gender", values = c("#6D57CF", "#FCA532"),
                    labels = c("F", "M")) +
  geom_vline(xintercept = axis_breaks,
             linetype = "dotted",
             col = "darkgray") +
  scale_x_continuous(breaks = axis_breaks,
                     labels = abs(axis_breaks))
ggsave("figures/title_wordFreq_barplot_Prof.jpeg", units = "in", width = 7, height = 7, dpi = 300)
TF IDF
OBS: essas análises não ajudaram muito, talvez nem precisem mais ficar aqui
# tf-idf of each word, treating each gender as one document.
text_id <- text %>%
  count(gender, word) %>%
  bind_tf_idf(word, gender, n) %>%
  arrange(desc(tf_idf))
#text_id
text_id$word <- as.factor(text_id$word)

# Top words by tf-idf within each gender. The x axis carries the tf-idf values
# and the y axis the words, so the "tf-idf" label belongs on x (the two labels
# were swapped).
text_id %>%
  group_by(gender) %>%
  arrange(desc(tf_idf)) %>%
  top_n(5, tf_idf) %>%
  ggplot(aes(x = tf_idf, y = reorder(word, tf_idf), fill = gender)) +
  geom_col(show.legend = FALSE) +
  labs(x = "tf-idf", y = NULL) +
  facet_wrap(~gender, scales = "free") +
  theme_minimal()
TF IDF professors
OBS: essas análises não ajudaram muito, talvez nem precisem mais ficar aqui
# tf-idf of each word among professors' titles, treating each gender as one
# document.
text_idP <- text %>%
  filter(position_cat == "professor") %>%
  count(gender, word) %>%
  bind_tf_idf(word, gender, n) %>%
  arrange(desc(tf_idf))
#text_id
text_idP$word <- as.factor(text_idP$word)

# Top words by tf-idf within each gender. The x axis carries the tf-idf values
# and the y axis the words, so the "tf-idf" label belongs on x (the two labels
# were swapped).
text_idP %>%
  group_by(gender) %>%
  arrange(desc(tf_idf)) %>%
  top_n(5, tf_idf) %>%
  ggplot(aes(x = tf_idf, y = reorder(word, tf_idf), fill = gender)) +
  geom_col(show.legend = FALSE) +
  labs(x = "tf-idf", y = NULL) +
  facet_wrap(~gender, scales = "free") +
  theme_minimal()
Topic model
matext <- text %>% count(id, gender, word) %>% mutate(id = paste(id, gender, sep="_")) %>%
select(-gender) %>%
cast_dtm(term=word,document=id,value=n)
ap_lda2 <- LDA(matext, k = 2, control = list(seed = 1234))
ap_lda3 <- LDA(matext, k = 3, control = list(seed = 1234))
ap_lda4 <- LDA(matext, k = 4, control = list(seed = 1234))
bbmle::AICtab(ap_lda2, ap_lda3, ap_lda4,base=T)## AIC dAIC df
## ap_lda3 34358.6 0.0 3184
## ap_lda2 34405.8 47.2 2123
## ap_lda4 35971.6 1613.0 4245
word-topic probabilities
ap_topics <- tidy(ap_lda2, matrix = "beta")
ap_top_terms <- ap_topics %>%
group_by(topic) %>%
top_n(10, beta) %>%
ungroup() %>%
arrange(topic, -beta)
ap_top_terms %>%
mutate(term = reorder(term, beta)) %>% ggplot(aes(term, beta, fill = factor(topic))) + geom_col(show.legend = FALSE) +
facet_wrap(~ topic, scales = "free") + coord_flip()Document-topic probabilities
ap_documents <- tidy(ap_lda2, matrix = "gamma")
classifi <- ap_documents %>% mutate(gender = substr(document, nchar(document), nchar(document))) %>%
group_by(document,gender) %>%
top_n(1, gamma)
table(classifi$gender, classifi$topic)##
## 1 2
## F 66 71
## M 85 96
classifi %>% tabyl(gender, topic) %>% adorn_percentages() %>%
adorn_pct_formatting(digits = 0) ## gender 1 2
## F 48% 52%
## M 47% 53%
classifi %>%
# mutate(title = reorder(title, gamma * topic)) %>%
ggplot(aes(as.character(topic), gamma)) +
geom_boxplot() +
facet_wrap(~ gender)Topic model Professors only
matext <- text %>% filter(position_cat=="professor") %>%
count(id, gender, word) %>% mutate(id = paste(id, gender, sep="_")) %>%
select(-gender) %>%
cast_dtm(term=word,document=id,value=n)
ap_lda2 <- LDA(matext, k = 2, control = list(seed = 1234))
ap_lda3 <- LDA(matext, k = 3, control = list(seed = 1234))
ap_lda4 <- LDA(matext, k = 4, control = list(seed = 1234))
bbmle::AICtab(ap_lda2, ap_lda3, ap_lda4,base=T)## AIC dAIC df
## ap_lda2 8798.6 0.0 807
## ap_lda3 8825.7 27.0 1210
## ap_lda4 9444.3 645.7 1613
word-topic probabilities
ap_topics <- tidy(ap_lda2, matrix = "beta")
ap_top_terms <- ap_topics %>%
group_by(topic) %>%
top_n(10, beta) %>%
ungroup() %>%
arrange(topic, -beta)
ap_top_terms %>%
mutate(term = reorder(term, beta)) %>% ggplot(aes(term, beta, fill = factor(topic))) + geom_col(show.legend = FALSE) +
facet_wrap(~ topic, scales = "free") + coord_flip()Document-topic probabilities
ap_documents <- tidy(ap_lda2, matrix = "gamma")
classifi <- ap_documents %>% mutate(gender = substr(document, nchar(document), nchar(document))) %>%
group_by(document,gender) %>%
top_n(1, gamma)
table(classifi$gender, classifi$topic)##
## 1 2
## F 10 15
## M 34 40
classifi %>% tabyl(gender, topic) %>% adorn_percentages() %>%
adorn_pct_formatting(digits = 0) ## gender 1 2
## F 40% 60%
## M 46% 54%
classifi %>%
# mutate(title = reorder(title, gamma * topic)) %>%
ggplot(aes(as.character(topic), gamma)) +
geom_boxplot() +
geom_violin()+
facet_wrap(~ gender)
ABSTRACT - sentiment analysis
Chapter 2, Silge & Robinson. 2018
- The NRC lexicon categorizes words in a binary fashion ("yes"/"no") into categories of positive, negative, anger, anticipation, disgust, fear, joy, sadness, surprise, and trust.
get_sentiments("nrc")
## # A tibble: 13,875 × 2
## word sentiment
## <chr> <chr>
## 1 abacus trust
## 2 abandon fear
## 3 abandon negative
## 4 abandon sadness
## 5 abandoned anger
## 6 abandoned fear
## 7 abandoned negative
## 8 abandoned sadness
## 9 abandonment anger
## 10 abandonment fear
## # … with 13,865 more rows
- The Bing lexicon categorizes words in a binary fashion into positive and negative categories.
get_sentiments("bing")
## # A tibble: 6,786 × 2
## word sentiment
## <chr> <chr>
## 1 2-faces negative
## 2 abnormal negative
## 3 abolish negative
## 4 abominable negative
## 5 abominably negative
## 6 abominate negative
## 7 abomination negative
## 8 abort negative
## 9 aborted negative
## 10 aborts negative
## # … with 6,776 more rows
- The AFINN lexicon assigns words with a score that runs between -5 and 5, with negative scores indicating negative sentiment and positive scores indicating positive sentiment.
get_sentiments("afinn")
## # A tibble: 2,477 × 2
## word value
## <chr> <dbl>
## 1 abandon -2
## 2 abandoned -2
## 3 abandons -2
## 4 abducted -2
## 5 abduction -2
## 6 abductions -2
## 7 abhor -3
## 8 abhorred -3
## 9 abhorrent -3
## 10 abhors -3
## # … with 2,467 more rows
- Another one in the package
get_sentiments("loughran")
## # A tibble: 4,150 × 2
## word sentiment
## <chr> <chr>
## 1 abandon negative
## 2 abandoned negative
## 3 abandoning negative
## 4 abandonment negative
## 5 abandonments negative
## 6 abandons negative
## 7 abdicated negative
## 8 abdicates negative
## 9 abdicating negative
## 10 abdication negative
## # … with 4,140 more rows
Score words difference in female and male abstracts
affword <- get_sentiments("afinn")
affc <- text %>%
count(id,gender,word, sort = TRUE) %>%
inner_join(affword, "word")affc %>% group_by(id, gender) %>%
summarise(mean.score = mean(value),
weig.score = weighted.mean(value,n)) %>%
ggplot(aes(x=gender,y=mean.score)) +
geom_violin() +
geom_boxplot(width=0.1) +
ggtitle("Mean words score per abstract and gender") #ggbeeswarm::geom_beeswarm(size=3, shape=21)affc %>% group_by(id, gender) %>%
summarise(mean.score = mean(value),
weig.score = weighted.mean(value,n)) %>%
ggplot(aes(x=gender,y=weig.score)) +
geom_violin() +
geom_boxplot(width=0.1) +
ggtitle("Weighted mean words score per abstract and gender") #ggbeeswarm::geom_beeswarm(size=3, shape=21)Frequency of sentiment words per abstract
As classificações das palavras não me parecem muito acuradas com a linguagem científica
nrcword <- get_sentiments("nrc")
nrc <- text %>%
count(id,gender,word, sort = TRUE) %>%
inner_join(nrcword, "word") %>%
group_by(id,gender,sentiment) %>%
summarise(n= sum(n))
ggplot(nrc, aes(x=gender, y=n)) +
facet_wrap(~sentiment) +
geom_violin()Frequency of sentiment words per abstract
As classificações das palavras não me parecem muito acuradas com a linguagem científica
bingword <- get_sentiments("bing")
bing <- text %>%
count(id,gender,word, sort = TRUE) %>%
inner_join(bingword, "word") %>%
group_by(id,gender,sentiment) %>%
summarise(n= sum(n))
ggplot(bing, aes(x=sentiment, y=n)) +
facet_wrap(~gender) +
geom_violin()Frequency of sentiment words per abstract
As classificações das palavras não me parecem muito acuradas com a linguagem científica
# Number of words per presentation (id), gender and Loughran sentiment
# category; only words present in the Loughran lexicon are kept by the
# inner join.
louword <- get_sentiments("loughran")
lou <- text %>%
  count(id, gender, word, sort = TRUE) %>%
  inner_join(louword, by = "word") %>%
  group_by(id, gender, sentiment) %>%
  summarise(n = sum(n))
# Plot the Loughran counts. This previously plotted `bing`, so the Loughran
# result was computed but never shown.
ggplot(lou, aes(x = sentiment, y = n)) +
  facet_wrap(~gender) +
  geom_violin()